library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Load the raw book catalogue; column types are guessed by readr.
# NOTE(review): readr reports parsing failures for this file (some rows carry
# more fields than the header declares) -- inspect them with problems(books).
books <- read_csv(file = "data/books.csv")
##
## ── Column specification ──────────────────────────────────────────────────────────────────────────────
## cols(
## bookID = col_double(),
## title = col_character(),
## authors = col_character(),
## average_rating = col_double(),
## isbn = col_character(),
## isbn13 = col_character(),
## language_code = col_character(),
## num_pages = col_double(),
## ratings_count = col_double(),
## text_reviews_count = col_double(),
## publication_date = col_character(),
## publisher = col_character()
## )
## Warning: 21 parsing failures.
## row col expected actual file
## 1570 title delimiter or quote 'data/books.csv'
## 1570 title delimiter or quote I 'data/books.csv'
## 3349 average_rating a double Jr./Sam B. Warner 'data/books.csv'
## 3349 num_pages a double en-US 'data/books.csv'
## 3349 NA 12 columns 13 columns 'data/books.csv'
## .... .............. .................. ................. ................
## See problems(...) for more details.
# Number of rows and columns in the imported table.
dim(books)
## [1] 8472 12
# Compact overview: each column's type and first values, plus the "problems"
# and "spec" attributes that read_csv() attached to the tibble.
str(books)
## tibble [8,472 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ bookID : num [1:8472] 1 2 4 5 8 9 10 12 13 14 ...
## $ title : chr [1:8472] "Harry Potter and the Half-Blood Prince (Harry Potter #6)" "Harry Potter and the Order of the Phoenix (Harry Potter #5)" "Harry Potter and the Chamber of Secrets (Harry Potter #2)" "Harry Potter and the Prisoner of Azkaban (Harry Potter #3)" ...
## $ authors : chr [1:8472] "J.K. Rowling/Mary GrandPré" "J.K. Rowling/Mary GrandPré" "J.K. Rowling" "J.K. Rowling/Mary GrandPré" ...
## $ average_rating : num [1:8472] 4.57 4.49 4.42 4.56 4.78 3.74 4.73 4.38 4.38 4.22 ...
## $ isbn : chr [1:8472] "0439785960" "0439358078" "0439554896" "043965548X" ...
## $ isbn13 : chr [1:8472] "9780439785969" "9780439358071" "9780439554893" "9780439655484" ...
## $ language_code : chr [1:8472] "eng" "eng" "eng" "eng" ...
## $ num_pages : num [1:8472] 652 870 352 435 2690 ...
## $ ratings_count : num [1:8472] 2095690 2153167 6333 2339585 41428 ...
## $ text_reviews_count: num [1:8472] 27591 29221 244 36325 164 ...
## $ publication_date : chr [1:8472] "9/16/2006" "9/1/2004" "11/1/2003" "5/1/2004" ...
## $ publisher : chr [1:8472] "Scholastic Inc." "Scholastic Inc." "Scholastic" "Scholastic Inc." ...
## - attr(*, "problems")= tibble [21 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:21] 1570 1570 3349 3349 3349 4513 4513 4513 4513 4513 ...
## ..$ col : chr [1:21] "title" "title" "average_rating" "num_pages" ...
## ..$ expected: chr [1:21] "delimiter or quote" "delimiter or quote" "a double" "a double" ...
## ..$ actual : chr [1:21] " " "I" "Jr./Sam B. Warner" "en-US" ...
## ..$ file : chr [1:21] "'data/books.csv'" "'data/books.csv'" "'data/books.csv'" "'data/books.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. bookID = col_double(),
## .. title = col_character(),
## .. authors = col_character(),
## .. average_rating = col_double(),
## .. isbn = col_character(),
## .. isbn13 = col_character(),
## .. language_code = col_character(),
## .. num_pages = col_double(),
## .. ratings_count = col_double(),
## .. text_reviews_count = col_double(),
## .. publication_date = col_character(),
## .. publisher = col_character()
## .. )
# Column names as imported.
names(books)
## [1] "bookID" "title" "authors"
## [4] "average_rating" "isbn" "isbn13"
## [7] "language_code" "num_pages" "ratings_count"
## [10] "text_reviews_count" "publication_date" "publisher"
# Print the tibble to preview the data.
books
Searching for NA values.
# Rows where the author field is missing.
filter(books, is.na(authors))
# Rows where the average rating is missing.
filter(books, is.na(average_rating))
Renaming the `bookID` column to `book_id`.
# Same data as `books`, with `bookID` renamed to snake_case `book_id`.
rename_books_col <- rename(books, book_id = bookID)
Removing rows that contain NA values.
# Keep only rows with no missing value in any column. `book_id:publisher`
# spans every column of the renamed table, so this is a complete-case filter.
# tidyr::drop_na() replaces the superseded filter_at()/all_vars() scoped verbs.
books_no_na <- rename_books_col %>%
  drop_na(book_id:publisher)
Top 10 highest-rated books with at least 100 ratings (`ratings_count`).
# Ten highest-rated books among those with at least 100 ratings.
# Every step must end in `%>%`: without the pipe after filter(), `head(10)`
# runs as a separate statement (it just returns 10) and the table is never
# truncated to ten rows.
top_rated_books <- books_no_na %>%
  select(title, authors, average_rating, ratings_count) %>%
  filter(ratings_count >= 100) %>%
  arrange(desc(average_rating)) %>%
  head(10)
top_rated_books
Books with 2,000 or more pages
# Titles and page counts of books with at least 2000 pages, longest first.
over_two_thousand <- books_no_na %>%
  filter(num_pages >= 2000) %>%
  arrange(desc(num_pages)) %>%
  select(title, num_pages)
over_two_thousand
Number of books for each `authors` entry (each co-author combination is counted separately).
# One row per distinct `authors` string, with `n` = number of books carrying
# that exact string.
book_count <- count(books_no_na, authors)
book_count
Average number of pages per publisher
# Mean page count per publisher: one row per publisher.
avg_page_pub <- summarise(
  group_by(books_no_na, publisher),
  avg_num_pages = mean(num_pages)
)
## `summarise()` ungrouping output (override with `.groups` argument)
avg_page_pub